library('stringr')
library('dplyr')
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('ggplot2')
df = read.csv('resale-flat-prices-based-on-approval-date-final dataset.csv')
df
Q1.1. Which year between 1990 and 2016 showed a rise in the selling of flats? or the maximum number of resale apartments available in a given year?
# getting year from date
df$month=as.Date(paste(df$month,"-01",sep=""))
df$year = format(as.Date(df$month, format="%Y-%m-%d"),"%Y")
df
group_by_year = df %>% group_by(year) %>% tally()
group_by_year
ggplot(group_by_year, aes(x=year, y=n,fill=year)) +
geom_bar(stat = "identity")+
scale_fill_hue(c = 40) +
scale_x_discrete(guide = guide_axis(angle = 65))+
theme(legend.position="none")

max_resales=max(group_by_year$n)
print(paste("Max no of resales in a an year: ",max_resales))
## [1] "Max no of resales in a an year: 57782"
Q2 From 1990 to2016, which town has the most expensive selling price?
print("All Town in dataset")
## [1] "All Town in dataset"
distinct(df,town)
sum_of_resale_in_town = df %>% group_by(town) %>% summarise(total=mean(resale_price))
sum_of_resale_in_town=sum_of_resale_in_town[!is.na(sum_of_resale_in_town$total), ]
sum_of_resale_in_town
ggplot(sum_of_resale_in_town, aes(x=town, y=total, group = 1)) +
geom_line(color = "#0099f9", size = 2) +
geom_point(color = "#0099f9", size = 5)+
scale_x_discrete(guide = guide_axis(angle = 65))+
ggtitle("Most expensive Resale in a town")

max_expensive=max(sum_of_resale_in_town$total)
print(paste("Most expensive town is PUNGGOL with avg. cost: ",max_expensive))
## [1] "Most expensive town is PUNGGOL with avg. cost: 427489.646479146"
Q3. What sorts of flat models are present in the dataset? and what relation do they have with resale price?
df$flat_model = str_to_lower(df$flat_model)
distinct(df,flat_model)
group_by_flat_model = df %>% group_by(flat_model) %>% summarise(resale=mean(resale_price))
group_by_flat_model = group_by_flat_model[!is.na(group_by_flat_model$resale),]
group_by_flat_model
ggplot(group_by_flat_model, aes(x=flat_model, y=resale,fill=flat_model)) +
geom_bar(stat = "identity")+
scale_fill_hue(c = 40) +
scale_x_discrete(guide = guide_axis(angle = 65))+
theme(legend.position="none")

- Type s1 and s2 flats are more expensive along with premium apartment lofts
- 2 room and simplified along with new generation flats are cheaper
Q4 What is the average, maximum, and minimum selling price of a four-bedroom flat in the dataset over the course of 1990-2016, together with comprehensive information of the town, floor area sqm, storey range, and block number?
four_bed_room = subset(df, flat_type == "4 ROOM")
four_bed_room
max_price = max(four_bed_room$resale_price)
min_price = min(four_bed_room$resale_price)
avg_price = mean(four_bed_room$resale_price)
print(paste("Max price of a four bedroom flat is: ",max_price))
## [1] "Max price of a four bedroom flat is: 990000"
print(paste("Min price of a four bedroom flat is: ",min_price))
## [1] "Min price of a four bedroom flat is: 32500"
print(paste("Avg price of a four bedroom flat is: ",avg_price))
## [1] "Avg price of a four bedroom flat is: 280633.517645056"
boxplot(four_bed_room$resale_price)

ggplot(four_bed_room, aes(x=flat_model, y=resale_price, fill=flat_model)) +
geom_boxplot() +
scale_fill_hue(c = 40) +
scale_x_discrete(guide = guide_axis(angle = 65))

ggplot(four_bed_room, aes(x=year, y=resale_price,fill=floor_area_sqm)) +
geom_point() +
scale_x_discrete(guide = guide_axis(angle = 65))

multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
temp = select(four_bed_room,resale_price,lease_commence_date,floor_area_sqm)
myplots <- list() # new empty list
for(i in 1:ncol(temp)){
col <- names(temp)[i]
ggp <- ggplot(temp, aes_string(x = col)) +
geom_histogram(bins = 30, fill = "cornflowerblue", color = "black") +
geom_vline(xintercept = mean(temp[[col]]), col = "red", lwd=1.5)
myplots[[i]] <- ggp # add each plot into plot list
}
multiplot(plotlist = myplots, cols = 2)

Q5 The number of resale apartments for each flat type for 2005 and 2015? Is there a difference between 2005 and 2015 in terms of flat type trends?
group_year_2005 = subset(df,year=="2005")
group_year_2005
group_2005_flat_type = group_year_2005 %>% group_by(flat_type) %>% tally()
group_2005_flat_type
ggplot(group_2005_flat_type, aes(x=flat_type, y=n,fill=flat_type)) +
geom_bar(stat = "identity")+
scale_fill_hue(c = 40) +
scale_x_discrete(guide = guide_axis(angle = 65))+
theme(legend.position="none")

df1 <- group_year_2005 %>%
group_by(flat_type)%>%# Variable to be transformed
count() %>%
ungroup() %>%
mutate(perc = `n` / sum(`n`)) %>%
arrange(perc) %>%
mutate(labels = scales::percent(perc))
df1
ggplot(df1, aes(x = "", y = perc, fill = flat_type)) +
geom_col() +
geom_text(aes(label = labels),
position = position_stack(vjust = 0.5)) +
coord_polar(theta = "y")

2015
group_year_2015 = subset(df,year=="2015")
group_year_2015
group_2015_flat_type = group_year_2015 %>% group_by(flat_type) %>% tally()
group_2015_flat_type
ggplot(group_2015_flat_type, aes(x=flat_type, y=n,fill=flat_type)) +
geom_bar(stat = "identity")+
scale_fill_hue(c = 40) +
scale_x_discrete(guide = guide_axis(angle = 65))+
theme(legend.position="none")

df2 <- group_year_2015 %>%
group_by(flat_type)%>%# Variable to be transformed
count() %>%
ungroup() %>%
mutate(perc = `n` / sum(`n`)) %>%
arrange(perc) %>%
mutate(labels = scales::percent(perc))
df2
ggplot(df2, aes(x = "", y = perc, fill = flat_type)) +
geom_col() +
geom_text(aes(label = labels),
position = position_stack(vjust = 0.5)) +
coord_polar(theta = "y")

- The trend in 2005 and 2015 in flat_type is almost same
plot(factor(group_2015_flat_type$flat_type),group_2015_flat_type$n,type="l",col="red")
lines(factor(group_2015_flat_type$flat_type),group_2005_flat_type$n,col="green")

Trend in flat_type in 2005 and 2015 is not the same
Q6.Categorize which towns are more expensive and which towns are more affordable for property purchasers in the dataset
mean_of_sales_price = mean(df$resale_price,na.rm=TRUE)
sd_of_sales_price = sd(df$resale_price,na.rm = TRUE)
mean_of_sales_price
## [1] 277848.6
sum_of_resale_in_town$Category[sum_of_resale_in_town$total<=mean_of_sales_price]="Affordable"
## Warning: Unknown or uninitialised column: `Category`.
sum_of_resale_in_town$Category[sum_of_resale_in_town$total>mean_of_sales_price ]="Expensive"
sum_of_resale_in_town
ggplot(sum_of_resale_in_town, aes(x=town, y=total)) +
geom_point(aes(color = factor(Category), shape=factor(Category))) +
scale_x_discrete(guide = guide_axis(angle = 65))
